bitkeeper revision 1.1345 (4266bd05lHlHunb0CEvOq60j2DvKCQ)
authorleendert@watson.ibm.com[iap10] <leendert@watson.ibm.com[iap10]>
Wed, 20 Apr 2005 20:35:17 +0000 (20:35 +0000)
committerleendert@watson.ibm.com[iap10] <leendert@watson.ibm.com[iap10]>
Wed, 20 Apr 2005 20:35:17 +0000 (20:35 +0000)
[PATCH] VMX world switch

The attached code implements a VMX world switch to vmxassist (a small assist
module residing in a VMX enabled partition where it is responsible for
emulating real mode) whenever CR0.PE is disabled.

The patch temporarily disables the PGE feature flag in cpuid as it is
currently broken (try running an unmodified 2.6 kernel that sets PGE in
mm/init.c/paging_init()).

The patch adds consistency checks before setting the ARCH_VMX_IO_WAIT state
to detect race conditions on SMP systems.

Signed-Off-By: Leendert van Doorn <leendert@watson.ibm.com>
Signed-off-by: ian@xensource.com
.rootkeys
xen/arch/x86/vmx.c
xen/arch/x86/vmx_platform.c
xen/include/asm-x86/vmx_vmcs.h
xen/include/public/vmx_assist.h [new file with mode: 0644]

index 66b981152c5cdb3682b9d3058f862440eb40a1cd..46c7621e13b48728fdd2bbf5d41d8b0d2e459b78 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 4051db79512nOCGweabrFWO2M2h5ng xen/include/public/physdev.h
 40589968wmhPmV5-ENbBYmMjnedgKw xen/include/public/sched_ctl.h
 404f3d2eR2Owk-ZcGOx9ULGHg3nrww xen/include/public/trace.h
+4266bd01Ul-pC01ZVvBkhBnv5eqzvw xen/include/public/vmx_assist.h
 3ddb79c25UE59iu4JJcbRalx95mvcg xen/include/public/xen.h
 3e397e66m2tO3s-J8Jnr7Ws_tGoPTg xen/include/xen/ac_timer.h
 40715b2epYl2jBbxzz9CI2rgIca7Zg xen/include/xen/acpi.h
index e6750ae8883293a3c7bf2964e8a444a13a566b52..5218f53d50c5580417b47f0ff1df301438c085bc 100644 (file)
@@ -195,6 +195,7 @@ static void vmx_vmexit_do_cpuid(unsigned long input, struct xen_regs *regs)
     cpuid(input, &eax, &ebx, &ecx, &edx);
 
     if (input == 1) {
+        clear_bit(X86_FEATURE_PGE, &edx); /* temporarily disabled */
         clear_bit(X86_FEATURE_PSE, &edx);
         clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
@@ -382,10 +383,261 @@ static void vmx_io_instruction(struct xen_regs *regs,
     do_block();
 }
 
-static int
-vm86assist(struct exec_domain *d)
+enum { COPY_IN = 0, COPY_OUT };
+
+static inline int
+vmx_copy(void *buf, unsigned long laddr, int size, int dir)
+{
+    unsigned char *addr;
+    unsigned long mfn;
+
+    if ((size + (laddr & (PAGE_SIZE - 1))) >= PAGE_SIZE) {
+       printf("vmx_copy exceeds page boundary\n");
+       return 0;
+    }
+
+    mfn = phys_to_machine_mapping(gva_to_gpte(laddr) >> PAGE_SHIFT);
+    addr = map_domain_mem((mfn << PAGE_SHIFT) | (laddr & ~PAGE_MASK));
+
+    if (dir == COPY_IN)
+           memcpy(buf, addr, size);
+    else
+           memcpy(addr, buf, size);
+
+    unmap_domain_mem(addr);
+    return 1;
+}
+
+int
+vmx_world_save(struct exec_domain *d, struct vmx_assist_context *c)
 {
-    /* stay tuned ... */
+    unsigned long inst_len;
+    int error = 0;
+
+    error |= __vmread(INSTRUCTION_LEN, &inst_len);
+    error |= __vmread(GUEST_EIP, &c->eip);
+    c->eip += inst_len; /* skip transition instruction */
+    error |= __vmread(GUEST_ESP, &c->esp);
+    error |= __vmread(GUEST_EFLAGS, &c->eflags);
+
+    error |= __vmread(CR0_READ_SHADOW, &c->cr0);
+    c->cr3 = d->arch.arch_vmx.cpu_cr3;
+    error |= __vmread(CR4_READ_SHADOW, &c->cr4);
+
+    error |= __vmread(GUEST_IDTR_LIMIT, &c->idtr_limit);
+    error |= __vmread(GUEST_IDTR_BASE, &c->idtr_base);
+
+    error |= __vmread(GUEST_GDTR_LIMIT, &c->gdtr_limit);
+    error |= __vmread(GUEST_GDTR_BASE, &c->gdtr_base);
+
+    error |= __vmread(GUEST_CS_SELECTOR, &c->cs_sel);
+    error |= __vmread(GUEST_CS_LIMIT, &c->cs_limit);
+    error |= __vmread(GUEST_CS_BASE, &c->cs_base);
+    error |= __vmread(GUEST_CS_AR_BYTES, &c->cs_arbytes.bytes);
+
+    error |= __vmread(GUEST_DS_SELECTOR, &c->ds_sel);
+    error |= __vmread(GUEST_DS_LIMIT, &c->ds_limit);
+    error |= __vmread(GUEST_DS_BASE, &c->ds_base);
+    error |= __vmread(GUEST_DS_AR_BYTES, &c->ds_arbytes.bytes);
+
+    error |= __vmread(GUEST_ES_SELECTOR, &c->es_sel);
+    error |= __vmread(GUEST_ES_LIMIT, &c->es_limit);
+    error |= __vmread(GUEST_ES_BASE, &c->es_base);
+    error |= __vmread(GUEST_ES_AR_BYTES, &c->es_arbytes.bytes);
+
+    error |= __vmread(GUEST_SS_SELECTOR, &c->ss_sel);
+    error |= __vmread(GUEST_SS_LIMIT, &c->ss_limit);
+    error |= __vmread(GUEST_SS_BASE, &c->ss_base);
+    error |= __vmread(GUEST_SS_AR_BYTES, &c->ss_arbytes.bytes);
+
+    error |= __vmread(GUEST_FS_SELECTOR, &c->fs_sel);
+    error |= __vmread(GUEST_FS_LIMIT, &c->fs_limit);
+    error |= __vmread(GUEST_FS_BASE, &c->fs_base);
+    error |= __vmread(GUEST_FS_AR_BYTES, &c->fs_arbytes.bytes);
+
+    error |= __vmread(GUEST_GS_SELECTOR, &c->gs_sel);
+    error |= __vmread(GUEST_GS_LIMIT, &c->gs_limit);
+    error |= __vmread(GUEST_GS_BASE, &c->gs_base);
+    error |= __vmread(GUEST_GS_AR_BYTES, &c->gs_arbytes.bytes);
+
+    error |= __vmread(GUEST_TR_SELECTOR, &c->tr_sel);
+    error |= __vmread(GUEST_TR_LIMIT, &c->tr_limit);
+    error |= __vmread(GUEST_TR_BASE, &c->tr_base);
+    error |= __vmread(GUEST_TR_AR_BYTES, &c->tr_arbytes.bytes);
+
+    error |= __vmread(GUEST_LDTR_SELECTOR, &c->ldtr_sel);
+    error |= __vmread(GUEST_LDTR_LIMIT, &c->ldtr_limit);
+    error |= __vmread(GUEST_LDTR_BASE, &c->ldtr_base);
+    error |= __vmread(GUEST_LDTR_AR_BYTES, &c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+int
+vmx_world_restore(struct exec_domain *d, struct vmx_assist_context *c)
+{
+    unsigned long mfn, old_cr4;
+    int error = 0;
+
+    error |= __vmwrite(GUEST_EIP, c->eip);
+    error |= __vmwrite(GUEST_ESP, c->esp);
+    error |= __vmwrite(GUEST_EFLAGS, c->eflags);
+
+    error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
+
+    if (c->cr3 == d->arch.arch_vmx.cpu_cr3) {
+       /* 
+        * This is simple TLB flush, implying the guest has 
+        * removed some translation or changed page attributes.
+        * We simply invalidate the shadow.
+        */
+       mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+       if ((mfn << PAGE_SHIFT) != pagetable_val(d->arch.guest_table)) {
+           VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+           domain_crash_synchronous();
+           return 0;
+       }
+       shadow_sync_all(d->domain);
+    } else {
+       /*
+        * If different, make a shadow. Check if the PDBR is valid
+        * first.
+        */
+       VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 c->cr3 = %lx", c->cr3);
+       if ((c->cr3 >> PAGE_SHIFT) > d->domain->max_pages) {
+           VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value=%lx", c->cr3);
+           domain_crash_synchronous(); 
+           return 0;
+       }
+       mfn = phys_to_machine_mapping(c->cr3 >> PAGE_SHIFT);
+       d->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+       update_pagetables(d);
+       /* 
+        * arch.shadow_table should now hold the next CR3 for shadow
+        */
+       d->arch.arch_vmx.cpu_cr3 = c->cr3;
+       VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", c->cr3);
+       __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
+    }
+
+    error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE));
+    error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
+    error |= __vmwrite(GUEST_IDTR_BASE, c->idtr_base);
+
+    error |= __vmwrite(GUEST_GDTR_LIMIT, c->gdtr_limit);
+    error |= __vmwrite(GUEST_GDTR_BASE, c->gdtr_base);
+
+    error |= __vmwrite(GUEST_CS_SELECTOR, c->cs_sel);
+    error |= __vmwrite(GUEST_CS_LIMIT, c->cs_limit);
+    error |= __vmwrite(GUEST_CS_BASE, c->cs_base);
+    error |= __vmwrite(GUEST_CS_AR_BYTES, c->cs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_DS_SELECTOR, c->ds_sel);
+    error |= __vmwrite(GUEST_DS_LIMIT, c->ds_limit);
+    error |= __vmwrite(GUEST_DS_BASE, c->ds_base);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, c->ds_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_ES_SELECTOR, c->es_sel);
+    error |= __vmwrite(GUEST_ES_LIMIT, c->es_limit);
+    error |= __vmwrite(GUEST_ES_BASE, c->es_base);
+    error |= __vmwrite(GUEST_ES_AR_BYTES, c->es_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_SS_SELECTOR, c->ss_sel);
+    error |= __vmwrite(GUEST_SS_LIMIT, c->ss_limit);
+    error |= __vmwrite(GUEST_SS_BASE, c->ss_base);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, c->ss_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_FS_SELECTOR, c->fs_sel);
+    error |= __vmwrite(GUEST_FS_LIMIT, c->fs_limit);
+    error |= __vmwrite(GUEST_FS_BASE, c->fs_base);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, c->fs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GS_SELECTOR, c->gs_sel);
+    error |= __vmwrite(GUEST_GS_LIMIT, c->gs_limit);
+    error |= __vmwrite(GUEST_GS_BASE, c->gs_base);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, c->gs_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_TR_SELECTOR, c->tr_sel);
+    error |= __vmwrite(GUEST_TR_LIMIT, c->tr_limit);
+    error |= __vmwrite(GUEST_TR_BASE, c->tr_base);
+    error |= __vmwrite(GUEST_TR_AR_BYTES, c->tr_arbytes.bytes);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, c->ldtr_sel);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, c->ldtr_limit);
+    error |= __vmwrite(GUEST_LDTR_BASE, c->ldtr_base);
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, c->ldtr_arbytes.bytes);
+
+    return !error;
+}
+
+enum { VMX_ASSIST_INVOKE = 0, VMX_ASSIST_RESTORE };
+
+int
+vmx_assist(struct exec_domain *d, int mode)
+{
+    struct vmx_assist_context c;
+    unsigned long magic, cp;
+
+    /* make sure vmxassist exists (this is not an error) */
+    if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
+       return 0;
+    if (magic != VMXASSIST_MAGIC)
+       return 0;
+
+    switch (mode) {
+    /*
+     * Transfer control to vmxassist.
+     * Store the current context in VMXASSIST_OLD_CONTEXT and load
+     * the new VMXASSIST_NEW_CONTEXT context. This context was created
+     * by vmxassist and will transfer control to it.
+     */
+    case VMX_ASSIST_INVOKE:
+       /* save the old context */
+       if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+           if (!vmx_world_save(d, &c))
+               goto error;
+           if (!vmx_copy(&c, cp, sizeof(c), COPY_OUT))
+               goto error;
+       }
+
+       /* restore the new context, this should activate vmxassist */
+       if (!vmx_copy(&cp, VMXASSIST_NEW_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+               goto error;
+           if (!vmx_world_restore(d, &c))
+               goto error;
+           return 1;
+       }
+       break;
+
+    /*
+     * Restore the VMXASSIST_OLD_CONTEXT that was saved by VMX_ASSIST_INVOKE
+     * above.
+     */
+    case VMX_ASSIST_RESTORE:
+       /* save the old context */
+       if (!vmx_copy(&cp, VMXASSIST_OLD_CONTEXT, sizeof(cp), COPY_IN))
+           goto error;
+       if (cp != 0) {
+            if (!vmx_copy(&c, cp, sizeof(c), COPY_IN))
+               goto error;
+           if (!vmx_world_restore(d, &c))
+               goto error;
+           return 1;
+       }
+       break;
+    }
+
+error:
+    printf("Failed to transfer to vmxassist\n");
+    domain_crash_synchronous(); 
     return 0;
 }
 
@@ -399,6 +651,7 @@ static int mov_to_cr(int gp, int cr, struct xen_regs *regs)
 {
     unsigned long value;
     unsigned long old_cr;
+    unsigned long eip;
     struct exec_domain *d = current;
 
     switch (gp) {
@@ -469,15 +722,28 @@ static int mov_to_cr(int gp, int cr, struct xen_regs *regs)
             put_page_and_type(&frame_table[old_base_mfn]);
         } else {
             if ((value & X86_CR0_PE) == 0) {
-               unsigned long eip;
-
                __vmread(GUEST_EIP, &eip);
                 VMX_DBG_LOG(DBG_LEVEL_1,
                        "Disabling CR0.PE at %%eip 0x%lx", eip);
-               if (vm86assist(d)) {
+               if (vmx_assist(d, VMX_ASSIST_INVOKE)) {
+                   set_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                               &d->arch.arch_vmx.cpu_state);
                    __vmread(GUEST_EIP, &eip);
                    VMX_DBG_LOG(DBG_LEVEL_1,
-                       "Transfering control to vm86assist %%eip 0x%lx", eip);
+                       "Transfering control to vmxassist %%eip 0x%lx", eip);
+                   return 0; /* do not update eip! */
+               }
+           } else if (test_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                       &d->arch.arch_vmx.cpu_state)) {
+               __vmread(GUEST_EIP, &eip);
+               VMX_DBG_LOG(DBG_LEVEL_1,
+                       "Enabling CR0.PE at %%eip 0x%lx", eip);
+               if (vmx_assist(d, VMX_ASSIST_RESTORE)) {
+                   clear_bit(VMX_CPU_STATE_ASSIST_ENABLED,
+                                               &d->arch.arch_vmx.cpu_state);
+                   __vmread(GUEST_EIP, &eip);
+                   VMX_DBG_LOG(DBG_LEVEL_1,
+                       "Restoring to %%eip 0x%lx", eip);
                    return 0; /* do not update eip! */
                }
            }
@@ -549,6 +815,7 @@ static int mov_to_cr(int gp, int cr, struct xen_regs *regs)
          */
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
             vmx_shadow_clear_state(d->domain);
+            shadow_sync_all(d->domain);
         }
         break;
     default:
index 4f5d1114ec6dfa90da14ead0f9178b47915050b0..5649597e97b5a55ee2efa94d2ae6804a58f7e33e 100644 (file)
@@ -484,6 +484,11 @@ static void send_mmio_req(unsigned long gpa,
 
     vm86 = inst_decoder_regs->eflags & X86_EFLAGS_VM;
 
+    if (test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)) {
+        printf("VMX I/O has not yet completed\n");
+        domain_crash_synchronous();
+    }
+
     set_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags);
     p->dir = dir;
     p->pdata_valid = pvalid;
index 80e6f7ad9c6c788c9aead4670a31f0c6a6120580..b972a17011032500428725d6ee4a28dce1ee54f8 100644 (file)
 #include <asm/config.h>
 #include <asm/vmx_cpu.h>
 #include <asm/vmx_platform.h>
+#include <public/vmx_assist.h>
 
 extern int start_vmx(void);
 extern void stop_vmx(void);
 
 void vmx_enter_scheduler(void);
 
-union vmcs_arbytes {
-    struct arbyte_fields {
-        unsigned int 
-        seg_type: 4, s: 1, dpl: 2, p: 1, 
-        reserved0: 4, avl: 1, reserved1: 1,     
-        default_ops_size: 1, g: 1, null_bit: 1, 
-        reserved2: 15;
-    }  __attribute__((packed)) fields;
-    unsigned int bytes;
-};
-
 #define VMX_CPU_STATE_PG_ENABLED        0       
+#define        VMX_CPU_STATE_ASSIST_ENABLED    1
 #define VMCS_SIZE                       0x1000
 
 struct vmcs_struct {
diff --git a/xen/include/public/vmx_assist.h b/xen/include/public/vmx_assist.h
new file mode 100644 (file)
index 0000000..6bb8c35
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
+ *
+ * Leendert van Doorn, leendert@watson.ibm.com
+ * Copyright (c) 2005, International Business Machines Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef _VMX_ASSIST_H_
+#define _VMX_ASSIST_H_
+
+#define        VMXASSIST_BASE          0xE0000
+#define        VMXASSIST_MAGIC         0x17101966
+#define        VMXASSIST_MAGIC_OFFSET  (VMXASSIST_BASE+8)
+
+#define        VMXASSIST_NEW_CONTEXT   (VMXASSIST_BASE + 12)
+#define        VMXASSIST_OLD_CONTEXT   (VMXASSIST_NEW_CONTEXT + 4)
+
+#ifndef __ASSEMBLY__
+
+union vmcs_arbytes {
+       struct arbyte_fields {
+               unsigned int    seg_type        : 4,
+                               s               : 1,
+                               dpl             : 2,
+                               p               : 1, 
+                               reserved0       : 4,
+                               avl             : 1,
+                               reserved1       : 1,     
+                               default_ops_size: 1,
+                               g               : 1,
+                               null_bit        : 1, 
+                               reserved2       : 15;
+       }  __attribute__((packed)) fields;
+       unsigned int bytes;
+};
+
+/*
+ * World switch state
+ */
+typedef struct vmx_assist_context {
+       unsigned long           eip;            /* execution pointer */
+       unsigned long           esp;            /* stack point */
+       unsigned long           eflags;         /* flags register */
+       unsigned long           cr0;
+       unsigned long           cr3;            /* page table directory */
+       unsigned long           cr4;
+       unsigned long           idtr_limit;     /* idt */
+       unsigned long           idtr_base;
+       unsigned long           gdtr_limit;     /* gdt */
+       unsigned long           gdtr_base;
+       unsigned long           cs_sel;         /* cs selector */
+       unsigned long           cs_limit;
+       unsigned long           cs_base;
+       union vmcs_arbytes      cs_arbytes;
+       unsigned long           ds_sel;         /* ds selector */
+       unsigned long           ds_limit;
+       unsigned long           ds_base;
+       union vmcs_arbytes      ds_arbytes;
+       unsigned long           es_sel;         /* es selector */
+       unsigned long           es_limit;
+       unsigned long           es_base;
+       union vmcs_arbytes      es_arbytes;
+       unsigned long           ss_sel;         /* ss selector */
+       unsigned long           ss_limit;
+       unsigned long           ss_base;
+       union vmcs_arbytes      ss_arbytes;
+       unsigned long           fs_sel;         /* fs selector */
+       unsigned long           fs_limit;
+       unsigned long           fs_base;
+       union vmcs_arbytes      fs_arbytes;
+       unsigned long           gs_sel;         /* gs selector */
+       unsigned long           gs_limit;
+       unsigned long           gs_base;
+       union vmcs_arbytes      gs_arbytes;
+       unsigned long           tr_sel;         /* task selector */
+       unsigned long           tr_limit;
+       unsigned long           tr_base;
+       union vmcs_arbytes      tr_arbytes;
+       unsigned long           ldtr_sel;       /* ldtr selector */
+       unsigned long           ldtr_limit;
+       unsigned long           ldtr_base;
+       union vmcs_arbytes      ldtr_arbytes;
+} vmx_assist_context_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _VMX_ASSIST_H_ */
+